Biostat 203B Homework 3

Due Feb 23 @ 11:59PM

Author

Wenbo Zhao 806074910

Display machine information for reproducibility:

sessionInfo()
R version 4.3.2 (2023-10-31)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 22.04.3 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0

locale:
 [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
 [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
 [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
[10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   

time zone: America/Los_Angeles
tzcode source: system (glibc)

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

loaded via a namespace (and not attached):
 [1] htmlwidgets_1.6.4 compiler_4.3.2    fastmap_1.1.1     cli_3.6.2        
 [5] tools_4.3.2       htmltools_0.5.7   rstudioapi_0.15.0 yaml_2.3.8       
 [9] rmarkdown_2.25    knitr_1.45        jsonlite_1.8.8    xfun_0.42        
[13] digest_0.6.34     rlang_1.1.3       evaluate_0.23    

Load necessary libraries (you can add more as needed).

library(arrow)
library(gtsummary)
library(memuse)
library(pryr)
library(R.utils)
library(tidyverse)
library(lubridate)

Display your machine memory.

memuse::Sys.meminfo()
Totalram:  15.621 GiB 
Freeram:   14.452 GiB 

In this exercise, we use tidyverse (ggplot2, dplyr, etc) to explore the MIMIC-IV data introduced in homework 1 and to build a cohort of ICU stays.

Q1. Visualizing patient trajectory

Visualizing a patient’s encounters in a health care system is a common task in clinical data analysis. In this question, we will visualize a patient’s ADT (admission-discharge-transfer) history and ICU vitals in the MIMIC-IV data.

Q1.1 ADT history

A patient’s ADT history records the time of admission, discharge, and transfer in the hospital. This figure shows the ADT history of the patient with subject_id 10001217 in the MIMIC-IV data. The x-axis is the calendar time, and the y-axis is the type of event (ADT, lab, procedure). The color of the line segment represents the care unit. The size of the line segment represents whether the care unit is an ICU/CCU. The crosses represent lab events, and the shape of the dots represents the type of procedure. The title of the figure shows the patient’s demographic information and the subtitle shows top 3 diagnoses.

Do a similar visualization for the patient with subject_id 10013310 using ggplot.

Hint: We need to pull information from data files patients.csv.gz, admissions.csv.gz, transfers.csv.gz, labevents.csv.gz, procedures_icd.csv.gz, diagnoses_icd.csv.gz, d_icd_procedures.csv.gz, and d_icd_diagnoses.csv.gz. For the big file labevents.csv.gz, use the Parquet format you generated in Homework 2. For reproducibility, make the Parquet folder labevents_pq available at the current working directory hw3, for example, by a symbolic link. Make your code reproducible.

ANSWER:

First we read the data and filter by subject_id:

#Read the data for subject_id: 10013310
id <- 10013310
#id <- 10001217

data_path <- "~/mimic/hosp/"
patients_file_path <- paste0(data_path, "patients.csv.gz")
admissions_file_path <- paste0(data_path, "admissions.csv.gz")
transfers_file_path <- paste0(data_path, "transfers.csv.gz")
labevents_file_path <- paste0(data_path, "labevents.parquet")
procedures_icd_file_path <- paste0(data_path, "procedures_icd.csv.gz")
d_icd_procedures_file_path <- paste0(data_path, "d_icd_procedures.csv.gz")
diagnoses_icd_file_path <- paste0(data_path, "diagnoses_icd.csv.gz")
d_icd_diagnoses_file_path <- paste0(data_path, "d_icd_diagnoses.csv.gz")

patients_data <- read_csv(patients_file_path) %>%
  filter(subject_id == id)

admissions_data <- read_csv(admissions_file_path) %>%
  filter(subject_id == id)

transfers_data <- read_csv(transfers_file_path) %>%
  filter(subject_id == id)

labevents_dataset <- open_dataset("labevents_pq", format = "parquet")
labevents_data <- labevents_dataset %>%
  filter(subject_id == id) %>%
  collect()

procedures_icd_data <- read_csv(procedures_icd_file_path) %>%
  filter(subject_id == id)

d_icd_procedures_data <- read_csv(d_icd_procedures_file_path)

diagnoses_icd_data <- read_csv(diagnoses_icd_file_path)%>%
  filter(subject_id == id)

d_icd_diagnoses_data <- read_csv(d_icd_diagnoses_file_path)
patients_data
admissions_data
transfers_data
labevents_data
procedures_icd_data
d_icd_procedures_data
diagnoses_icd_data
d_icd_diagnoses_data

We select the first admission data according to the admittime, this data is specified by hadm_id. Then we can extract ADT information.

#ADT
#Select the first admission:
first_admission <- admissions_data %>%
  arrange(admittime) %>%
  slice(1)

first_hadm_id <- first_admission$hadm_id

transfers_first_admission <- transfers_data %>%
  filter(hadm_id == first_hadm_id & eventtype != "discharge")

#Decide if the transfer belongs to ICU/CCU
icu_ccu_pattern <- "ICU|CCU"
transfers_first_admission <- transfers_first_admission %>%
  mutate(icu_ccu = if_else(grepl(icu_ccu_pattern, careunit, ignore.case = TRUE),
                           "ICU/CCU", "Other"))

#ADT data for plot
adt_transfer_data <- transfers_first_admission %>%
  mutate(event_start = intime,
         event_end = outtime,
         ) %>%
  select(subject_id, hadm_id, eventtype, icu_ccu, careunit, event_start,
         event_end) %>%
  arrange(event_start)

Extract patient information:

#Patient Information
race <- first_admission$race
age <- patients_data$anchor_age
gender <- patients_data$gender
title <- paste("Patient", id, ",", gender, ",", age, "years old,", race)

Extract lab events:

#Lab Event
lab_data <- labevents_data %>%
  select(labevent_id, hadm_id, subject_id, charttime) %>%
  filter(hadm_id == first_hadm_id) %>%
  arrange(charttime)

Extract procedures:

#Procedures
selected_procedures <- procedures_icd_data %>%
  select(subject_id, hadm_id, chartdate, icd_code) %>%
  filter(hadm_id == first_hadm_id)%>%
  left_join(d_icd_procedures_data, by = "icd_code") %>%
  select(subject_id, hadm_id, chartdate, icd_code, long_title)

selected_procedures$procedure_type <- as.factor(selected_procedures$long_title) 
selected_procedures$chartdate <- as.POSIXct(selected_procedures$chartdate, 
                                            tz = "UTC")

Find the top 3 diagnoses:

#Diagnoses
selected_diagnoses <- diagnoses_icd_data %>%
  select(subject_id, hadm_id, icd_code) %>%
  filter(hadm_id == first_hadm_id)%>%
  left_join(d_icd_diagnoses_data, by = "icd_code") %>%
  select(subject_id, hadm_id, icd_code, long_title) %>%
  slice(1:3)

top_diagnoses_titles <- selected_diagnoses$long_title
subtitle <- paste(top_diagnoses_titles, collapse = "\n")

Plot the graph:

# Plot
ggplot() +
  # ADT
  geom_segment(data = adt_transfer_data, 
               aes(x = event_start, xend = event_end,y = 'ADT', yend = 'ADT', 
                   color = careunit, size = icu_ccu), show.legend = TRUE) +
  scale_size_manual(values = c("ICU/CCU" = 5.0, "Other" = 1.0), guide = FALSE) +
  
  # LAB
  geom_point(data = lab_data, aes(x = charttime, y = 'LAB'), 
             shape = 3, size = 3, show.legend = FALSE) +
  
  # Procedure
  geom_point(data = selected_procedures, 
             aes(x = chartdate, y = 'Procedure', shape = procedure_type), 
             size = 3, show.legend = TRUE) +
  
  labs(title = title, subtitle = subtitle, x = "Calendar Time", y = "") +
  theme(legend.position = "bottom",
        legend.box = "vertical", 
        legend.box.background = element_rect(colour = "transparent"), 
        legend.margin = margin(t = 0, unit = "cm")) +
  guides(color = guide_legend(title = "Care Unit", 
                              override.aes = list(shape = NA), ncol = 3),
         shape = guide_legend(title = "Procedure", 
                              override.aes = list(linetype = "blank", size = 3), 
                              ncol = 1)) +
  scale_y_discrete(limits = c('Procedure', 'LAB', 'ADT'))
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
Warning: The `guide` argument in `scale_*()` cannot be `FALSE`. This was deprecated in
ggplot2 3.3.4.
ℹ Please use "none" instead.

Q1.2 ICU stays

ICU stays are a subset of ADT history. This figure shows the vitals of the patient 10001217 during ICU stays. The x-axis is the calendar time, and the y-axis is the value of the vital. The color of the line represents the type of vital. The facet grid shows the abbreviation of the vital and the stay ID.

ANSWER: Read chartevent data:

#Read chartevent data
chartevents_parquet <- open_dataset("chartevents_pq", format = "parquet")
  
parquet_data <- chartevents_parquet %>%
  select(subject_id, stay_id, itemid, charttime, valuenum) %>%
  filter(itemid %in% c(220045, 220181, 220179, 223761, 220210)) 

chartevents_tibble <- parquet_data %>% collect()

d_items_data <- read_csv("~/mimic/icu/d_items.csv.gz")
d_items_data <- d_items_data %>%
  select(itemid, label, abbreviation)
ICU_stays_data <- chartevents_tibble %>%
  left_join(d_items_data, by = "itemid")
ICU_stays_data

Extract patient data based on subject_id:

#Extract patient data
id <- 10013310
#id <- 10001217
patient_icu_stay <- ICU_stays_data %>%
  filter(subject_id == id)

For this plot, the point plot is abandoned for better readability.

ggplot(data = patient_icu_stay) + 
  geom_line(mapping = aes(x = charttime, y = valuenum, color = abbreviation)) +
  #geom_point(mapping = aes(x = charttime, y = valuenum, 
  #color = abbreviation), size = 1) + 
  facet_grid(abbreviation ~ stay_id, scales = "free") +
  labs(title = paste("Patient", id, "ICU stays - Vitals"),
       x = "Time",
       y = "Vital Value") +
  theme() +
  theme(legend.position = "none") + 
  guides(x = guide_axis(n.dodge = 2))

Q2. ICU stays

icustays.csv.gz (https://mimic.mit.edu/docs/iv/modules/icu/icustays/) contains data about Intensive Care Units (ICU) stays. The first 10 lines are

zcat < ~/mimic/icu/icustays.csv.gz | head
subject_id,hadm_id,stay_id,first_careunit,last_careunit,intime,outtime,los
10000032,29079034,39553978,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2180-07-23 14:00:00,2180-07-23 23:50:47,0.4102662037037037
10000980,26913865,39765666,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2189-06-27 08:42:00,2189-06-27 20:38:27,0.4975347222222222
10001217,24597018,37067082,Surgical Intensive Care Unit (SICU),Surgical Intensive Care Unit (SICU),2157-11-20 19:18:02,2157-11-21 22:08:00,1.1180324074074075
10001217,27703517,34592300,Surgical Intensive Care Unit (SICU),Surgical Intensive Care Unit (SICU),2157-12-19 15:42:24,2157-12-20 14:27:41,0.9481134259259258
10001725,25563031,31205490,Medical/Surgical Intensive Care Unit (MICU/SICU),Medical/Surgical Intensive Care Unit (MICU/SICU),2110-04-11 15:52:22,2110-04-12 23:59:56,1.338587962962963
10001884,26184834,37510196,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2131-01-11 04:20:05,2131-01-20 08:27:30,9.171817129629629
10002013,23581541,39060235,Cardiac Vascular Intensive Care Unit (CVICU),Cardiac Vascular Intensive Care Unit (CVICU),2160-05-18 10:00:53,2160-05-19 17:33:33,1.3143518518518518
10002155,20345487,32358465,Medical Intensive Care Unit (MICU),Medical Intensive Care Unit (MICU),2131-03-09 21:33:00,2131-03-10 18:09:21,0.8585763888888889
10002155,23822395,33685454,Coronary Care Unit (CCU),Coronary Care Unit (CCU),2129-08-04 12:45:00,2129-08-10 17:02:38,6.178912037037037

Q2.1 Ingestion

Import icustays.csv.gz as a tibble icustays_tble.

icustays_tble <- read_csv("~/mimic/icu/icustays.csv.gz")

Q2.2 Summary and visualization

How many unique subject_id? Can a subject_id have multiple ICU stays? Summarize the number of ICU stays per subject_id by graphs.

unique_subjects <- icustays_tble %>% 
  summarise(unique_subjects = n_distinct(subject_id))
print(unique_subjects)
# A tibble: 1 × 1
  unique_subjects
            <int>
1           50920

ANSWER: There are 50,920 unique subject_id in total.

icu_stays_summary <- icustays_tble %>%
  group_by(subject_id) %>%
  summarise(icu_stays = n_distinct(stay_id)) %>%
  ungroup() 

head(icu_stays_summary 
     %>% arrange(desc(icu_stays)))
# A tibble: 6 × 2
  subject_id icu_stays
       <dbl>     <int>
1   18358138        37
2   12468016        33
3   17585185        33
4   13269859        30
5   18676703        26
6   11281568        25

ANSWER: A subject_id can have multiple ICU stays.

ggplot(icu_stays_summary, aes(x = icu_stays)) +
  geom_histogram(binwidth = 1, fill = "blue") +
  theme_bw() +
  labs(title = "Number of ICU Stays per Subject",
       x = "Number of ICU Stays",
       y = "Count of Subjects")

ANSWER: According to this graph summary, most of subjects have a number of ICU Stays less than 3. Number of ICU Stays equals to 1 takes up a significant portion.

Q3. admissions data

Information of the patients admitted into hospital is available in admissions.csv.gz. See https://mimic.mit.edu/docs/iv/modules/hosp/admissions/ for details of each field in this file. The first 10 lines are

zcat < ~/mimic/hosp/admissions.csv.gz | head
subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admit_provider_id,admission_location,discharge_location,insurance,language,marital_status,race,edregtime,edouttime,hospital_expire_flag
10000032,22595853,2180-05-06 22:23:00,2180-05-07 17:15:00,,URGENT,P874LG,TRANSFER FROM HOSPITAL,HOME,Other,ENGLISH,WIDOWED,WHITE,2180-05-06 19:17:00,2180-05-06 23:30:00,0
10000032,22841357,2180-06-26 18:27:00,2180-06-27 18:49:00,,EW EMER.,P09Q6Y,EMERGENCY ROOM,HOME,Medicaid,ENGLISH,WIDOWED,WHITE,2180-06-26 15:54:00,2180-06-26 21:31:00,0
10000032,25742920,2180-08-05 23:44:00,2180-08-07 17:50:00,,EW EMER.,P60CC5,EMERGENCY ROOM,HOSPICE,Medicaid,ENGLISH,WIDOWED,WHITE,2180-08-05 20:58:00,2180-08-06 01:44:00,0
10000032,29079034,2180-07-23 12:35:00,2180-07-25 17:55:00,,EW EMER.,P30KEH,EMERGENCY ROOM,HOME,Medicaid,ENGLISH,WIDOWED,WHITE,2180-07-23 05:54:00,2180-07-23 14:00:00,0
10000068,25022803,2160-03-03 23:16:00,2160-03-04 06:26:00,,EU OBSERVATION,P51VDL,EMERGENCY ROOM,,Other,ENGLISH,SINGLE,WHITE,2160-03-03 21:55:00,2160-03-04 06:26:00,0
10000084,23052089,2160-11-21 01:56:00,2160-11-25 14:52:00,,EW EMER.,P6957U,WALK-IN/SELF REFERRAL,HOME HEALTH CARE,Medicare,ENGLISH,MARRIED,WHITE,2160-11-20 20:36:00,2160-11-21 03:20:00,0
10000084,29888819,2160-12-28 05:11:00,2160-12-28 16:07:00,,EU OBSERVATION,P63AD6,PHYSICIAN REFERRAL,,Medicare,ENGLISH,MARRIED,WHITE,2160-12-27 18:32:00,2160-12-28 16:07:00,0
10000108,27250926,2163-09-27 23:17:00,2163-09-28 09:04:00,,EU OBSERVATION,P38XXV,EMERGENCY ROOM,,Other,ENGLISH,SINGLE,WHITE,2163-09-27 16:18:00,2163-09-28 09:04:00,0
10000117,22927623,2181-11-15 02:05:00,2181-11-15 14:52:00,,EU OBSERVATION,P2358X,EMERGENCY ROOM,,Other,ENGLISH,DIVORCED,WHITE,2181-11-14 21:51:00,2181-11-15 09:57:00,0

Q3.1 Ingestion

Import admissions.csv.gz as a tibble admissions_tble.

admissions_tble <- read_csv("~/mimic/hosp/admissions.csv.gz")

Q3.2 Summary and visualization

Summarize the following information by graphics and explain any patterns you see.

  • number of admissions per patient
  • admission hour (anything unusual?)
  • admission minute (anything unusual?)
  • length of hospital stay (from admission to discharge) (anything unusual?)
#number of admissions per patient 
admissions_summary <- admissions_tble %>%
  group_by(subject_id) %>%
  summarise(admissions = n_distinct(hadm_id)) %>%
  ungroup() 

ggplot(admissions_summary, aes(x = admissions)) +
  geom_histogram(binwidth = 1, fill = "blue") +
  theme_bw() +
  labs(title = "Number of Admissions per Patient",
       x = "Number of Admissions",
       y = "Count of Patients")

ANSWER: 1. Number of patients keeps decreasing as the number of admissions increases. Most of patients have number of admissions which is less than 25.

#admission hour
admission_time_summary <- admissions_tble %>%
  mutate(admission_hour = hour(admittime),
         admission_minute = minute(admittime),
         length_of_stay = as.numeric(difftime(dischtime, admittime, units = "days")))

ggplot(admission_time_summary, aes(x = admission_hour)) +
  geom_histogram(bins = 24, fill = "cornflowerblue", color = "black") +
  labs(title = "Distribution of Admission Hours",
       x = "Hour of the Day",
       y = "Frequency") +
  theme_bw()

#timezone <- attr(admissions_tble$admittime, "tzone")
#print(paste("Dataset timezone:", timezone))

ANSWER: 2. For admission hour, the dips appears in late evening after 2am and hour around 8 to 13. There is a peak in 7am which may reflect the routine admission activity of hospitals. The peaks during the evening hours might be unusual as they suggest a high volume of admissions outside the typical working hours of elective procedures. It might indicate a large number of emergency admissions during these times.

# Visualize Admission Minute
ggplot(admission_time_summary, aes(x = admission_minute)) +
  geom_histogram(bins = 60, fill = "lightgreen", color = "black") +
  labs(title = "Distribution of Admission Minutes",
       x = "Minute of the Hour",
       y = "Frequency") +
  theme_bw()

ANSWER: 3. This graph depicts the distribution of admission minutes. There are significant spikes at the 0-minute mark and smaller ones at 15, 30, and 45 minutes after the hour. This pattern suggests that admissions are often recorded on the hour or at 15-minute intervals.

# Visualize Length of Hospital Stay
ggplot(admission_time_summary, aes(x = length_of_stay)) +
  geom_histogram(binwidth = 1, fill = "salmon", color = "black") +
  labs(title = "Distribution of Length of Hospital Stay",
       x = "Length of Stay (Days)",
       y = "Frequency") +
  theme_bw()

ANSWER: 4. This graph indicates a long tail distribution of the hospital stay length. Most of patients stay in hospital for a duration which is less than 25 days. Meanwhile there are patients with very long stays. These could be outliers or cases that required prolonged treatment.

According to the MIMIC-IV documentation,

All dates in the database have been shifted to protect patient confidentiality. Dates will be internally consistent for the same patient, but randomly distributed in the future. Dates of birth which occur in the present time are not true dates of birth. Furthermore, dates of birth which occur before the year 1900 occur if the patient is older than 89. In these cases, the patient’s age at their first admission has been fixed to 300.

Q4. patients data

Patient information is available in patients.csv.gz. See https://mimic.mit.edu/docs/iv/modules/hosp/patients/ for details of each field in this file. The first 10 lines are

zcat < ~/mimic/hosp/patients.csv.gz | head
subject_id,gender,anchor_age,anchor_year,anchor_year_group,dod
10000032,F,52,2180,2014 - 2016,2180-09-09
10000048,F,23,2126,2008 - 2010,
10000068,F,19,2160,2008 - 2010,
10000084,M,72,2160,2017 - 2019,2161-02-13
10000102,F,27,2136,2008 - 2010,
10000108,M,25,2163,2014 - 2016,
10000115,M,24,2154,2017 - 2019,
10000117,F,48,2174,2008 - 2010,
10000178,F,59,2157,2017 - 2019,

Q4.1 Ingestion

Import patients.csv.gz (https://mimic.mit.edu/docs/iv/modules/hosp/patients/) as a tibble patients_tble.

patients_tble <- read_csv("~/mimic/hosp/patients.csv.gz")
patients_tble
# A tibble: 299,712 × 6
   subject_id gender anchor_age anchor_year anchor_year_group dod       
        <dbl> <chr>       <dbl>       <dbl> <chr>             <date>    
 1   10000032 F              52        2180 2014 - 2016       2180-09-09
 2   10000048 F              23        2126 2008 - 2010       NA        
 3   10000068 F              19        2160 2008 - 2010       NA        
 4   10000084 M              72        2160 2017 - 2019       2161-02-13
 5   10000102 F              27        2136 2008 - 2010       NA        
 6   10000108 M              25        2163 2014 - 2016       NA        
 7   10000115 M              24        2154 2017 - 2019       NA        
 8   10000117 F              48        2174 2008 - 2010       NA        
 9   10000178 F              59        2157 2017 - 2019       NA        
10   10000248 M              34        2192 2014 - 2016       NA        
# ℹ 299,702 more rows

Q4.2 Summary and visualization

Summarize variables gender and anchor_age by graphics, and explain any patterns you see.

#Gender
ggplot(patients_tble, aes(x = gender)) +
  geom_bar(fill = "blue", color = "black") +
  labs(title = "Distribution of Patient's Gender",
       x = "Gender",
       y = "Counts") +
  theme_bw()

ANSWER: 1. The gender of patients is basically evenly distributed, with the number of female patients slightly bigger than that of male patients.

#AGE
ggplot(patients_tble, aes(x = anchor_age)) +
  geom_histogram(binwidth = 1, fill = "darkorange", color = "black") +
  labs(title = "Distribution of Anchor Age",
       x = "Age",
       y = "Frequency") +
  theme_minimal()

ANSWER: 2. There are two peaks for age of patients around age 23 and age 91. The frequency of age around 30 and 75 are relatively small. Note that according to the dataset document, if a patient’s anchor_age is over 89 in the anchor_year then their anchor_age is set to 91, regardless of how old they actually were, which explains the reason why we have a high volume in age 91.

Q5. Lab results

labevents.csv.gz (https://mimic.mit.edu/docs/iv/modules/hosp/labevents/) contains all laboratory measurements for patients. The first 10 lines are

zcat < ~/mimic/hosp/labevents.csv.gz | head
labevent_id,subject_id,hadm_id,specimen_id,itemid,order_provider_id,charttime,storetime,value,valuenum,valueuom,ref_range_lower,ref_range_upper,flag,priority,comments
1,10000032,,45421181,51237,P28Z0X,2180-03-23 11:51:00,2180-03-23 15:15:00,1.4,1.4,,0.9,1.1,abnormal,ROUTINE,
2,10000032,,45421181,51274,P28Z0X,2180-03-23 11:51:00,2180-03-23 15:15:00,___,15.1,sec,9.4,12.5,abnormal,ROUTINE,VERIFIED.
3,10000032,,52958335,50853,P28Z0X,2180-03-23 11:51:00,2180-03-25 11:06:00,___,15,ng/mL,30,60,abnormal,ROUTINE,NEW ASSAY IN USE ___: DETECTS D2 AND D3 25-OH ACCURATELY.
4,10000032,,52958335,50861,P28Z0X,2180-03-23 11:51:00,2180-03-23 16:40:00,102,102,IU/L,0,40,abnormal,ROUTINE,
5,10000032,,52958335,50862,P28Z0X,2180-03-23 11:51:00,2180-03-23 16:40:00,3.3,3.3,g/dL,3.5,5.2,abnormal,ROUTINE,
6,10000032,,52958335,50863,P28Z0X,2180-03-23 11:51:00,2180-03-23 16:40:00,109,109,IU/L,35,105,abnormal,ROUTINE,
7,10000032,,52958335,50864,P28Z0X,2180-03-23 11:51:00,2180-03-23 16:40:00,___,8,ng/mL,0,8.7,,ROUTINE,MEASURED BY ___.
8,10000032,,52958335,50868,P28Z0X,2180-03-23 11:51:00,2180-03-23 16:40:00,12,12,mEq/L,8,20,,ROUTINE,
9,10000032,,52958335,50878,P28Z0X,2180-03-23 11:51:00,2180-03-23 16:40:00,143,143,IU/L,0,40,abnormal,ROUTINE,

d_labitems.csv.gz (https://mimic.mit.edu/docs/iv/modules/hosp/d_labitems/) is the dictionary of lab measurements.

zcat < ~/mimic/hosp/d_labitems.csv.gz | head
itemid,label,fluid,category
50801,Alveolar-arterial Gradient,Blood,Blood Gas
50802,Base Excess,Blood,Blood Gas
50803,"Calculated Bicarbonate, Whole Blood",Blood,Blood Gas
50804,Calculated Total CO2,Blood,Blood Gas
50805,Carboxyhemoglobin,Blood,Blood Gas
50806,"Chloride, Whole Blood",Blood,Blood Gas
50808,Free Calcium,Blood,Blood Gas
50809,Glucose,Blood,Blood Gas
50810,"Hematocrit, Calculated",Blood,Blood Gas

We are interested in the lab measurements of creatinine (50912), potassium (50971), sodium (50983), chloride (50902), bicarbonate (50882), hematocrit (51221), white blood cell count (51301), and glucose (50931). Retrieve a subset of labevents.csv.gz that only containing these items for the patients in icustays_tble. Further restrict to the last available measurement (by storetime) before the ICU stay. The final labevents_tble should have one row per ICU stay and columns for each lab measurement.

Hint: Use the Parquet format you generated in Homework 2. For reproducibility, make labevents_pq folder available at the current working directory hw3, for example, by a symbolic link.

ANSWER: Read labevents and d_labitems data:

d_labitems_tble <- read_csv("~/mimic/hosp/d_labitems.csv.gz") %>%
  select(itemid, label)

labevents_dataset <- open_dataset("labevents_pq", format = "parquet")
labevents_data_label <- labevents_dataset %>%
  select(subject_id, itemid, charttime, storetime, valuenum) %>%
  filter(itemid %in% 
           c(50912, 50971, 50983, 50902, 50882, 51221, 51301, 50931)) %>%
  collect() %>%
  left_join(d_labitems_tble, by = "itemid")

Join processed labevents data and icustays data to get items for the patients in icustays_tble. Restrict to the last available measurement (by storetime) after the ICU stay (by intime). For a single stay_id, remove duplicated itemid by only keeping the item with latest storetime and charttime:

labevents_icu <- labevents_data_label %>%
  filter(subject_id %in% icustays_tble$subject_id) %>%
  left_join(select(icustays_tble, subject_id, stay_id, intime), 
            by = c("subject_id")) %>%
  group_by(stay_id, itemid) %>%
  filter(storetime < intime) %>%
  arrange(subject_id, stay_id, itemid, desc(storetime), desc(charttime)) %>%
  group_by(subject_id, stay_id, itemid) %>%
  slice_head(n = 1) %>%
  ungroup()
Warning in left_join(., select(icustays_tble, subject_id, stay_id, intime), : Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 845 of `x` matches multiple rows in `y`.
ℹ Row 1 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
  "many-to-many"` to silence this warning.

Transfer the label and valuenum columns to get value for each lab events:

labevents_tble <- labevents_icu %>%
  select(subject_id, stay_id, label, valuenum) %>%
  group_by(subject_id) %>%
  pivot_wider(
    names_from = label,  
    values_from = valuenum 
  ) %>%
  ungroup()
labevents_tble <- labevents_tble %>%
  rename(white_blood_cells = `White Blood Cells`)
labevents_tble
# A tibble: 68,467 × 10
   subject_id  stay_id Bicarbonate Chloride Creatinine Glucose Potassium Sodium
        <dbl>    <dbl>       <dbl>    <dbl>      <dbl>   <dbl>     <dbl>  <dbl>
 1   10000032 39553978          25       95        0.7     102       6.7    126
 2   10000980 39765666          21      109        2.3      89       3.9    144
 3   10001217 34592300          30      104        0.5      87       4.1    142
 4   10001217 37067082          22      108        0.6     112       4.2    142
 5   10001725 31205490          NA       98       NA        NA       4.1    139
 6   10001884 37510196          30       88        1.1     141       4.5    130
 7   10002013 39060235          24      102        0.9     288       3.5    137
 8   10002155 31090461          23       98        2.8     117       4.9    135
 9   10002155 32358465          26       85        1.4     133       5.7    120
10   10002155 33685454          24      105        1.1     138       4.6    139
# ℹ 68,457 more rows
# ℹ 2 more variables: Hematocrit <dbl>, white_blood_cells <dbl>

Q6. Vitals from charted events

chartevents.csv.gz (https://mimic.mit.edu/docs/iv/modules/icu/chartevents/) contains all the charted data available for a patient. During their ICU stay, the primary repository of a patient’s information is their electronic chart. The itemid variable indicates a single measurement type in the database. The value variable is the value measured for itemid. The first 10 lines of chartevents.csv.gz are

zcat < ~/mimic/icu/chartevents.csv.gz | head
subject_id,hadm_id,stay_id,caregiver_id,charttime,storetime,itemid,value,valuenum,valueuom,warning
10000032,29079034,39553978,47007,2180-07-23 21:01:00,2180-07-23 22:15:00,220179,82,82,mmHg,0
10000032,29079034,39553978,47007,2180-07-23 21:01:00,2180-07-23 22:15:00,220180,59,59,mmHg,0
10000032,29079034,39553978,47007,2180-07-23 21:01:00,2180-07-23 22:15:00,220181,63,63,mmHg,0
10000032,29079034,39553978,47007,2180-07-23 22:00:00,2180-07-23 22:15:00,220045,94,94,bpm,0
10000032,29079034,39553978,47007,2180-07-23 22:00:00,2180-07-23 22:15:00,220179,85,85,mmHg,0
10000032,29079034,39553978,47007,2180-07-23 22:00:00,2180-07-23 22:15:00,220180,55,55,mmHg,0
10000032,29079034,39553978,47007,2180-07-23 22:00:00,2180-07-23 22:15:00,220181,62,62,mmHg,0
10000032,29079034,39553978,47007,2180-07-23 22:00:00,2180-07-23 22:15:00,220210,20,20,insp/min,0
10000032,29079034,39553978,47007,2180-07-23 22:00:00,2180-07-23 22:15:00,220277,95,95,%,0

d_items.csv.gz (https://mimic.mit.edu/docs/iv/modules/icu/d_items/) is the dictionary for the itemid in chartevents.csv.gz.

zcat < ~/mimic/icu/d_items.csv.gz | head
itemid,label,abbreviation,linksto,category,unitname,param_type,lownormalvalue,highnormalvalue
220001,Problem List,Problem List,chartevents,General,,Text,,
220003,ICU Admission date,ICU Admission date,datetimeevents,ADT,,Date and time,,
220045,Heart Rate,HR,chartevents,Routine Vital Signs,bpm,Numeric,,
220046,Heart rate Alarm - High,HR Alarm - High,chartevents,Alarms,bpm,Numeric,,
220047,Heart Rate Alarm - Low,HR Alarm - Low,chartevents,Alarms,bpm,Numeric,,
220048,Heart Rhythm,Heart Rhythm,chartevents,Routine Vital Signs,,Text,,
220050,Arterial Blood Pressure systolic,ABPs,chartevents,Routine Vital Signs,mmHg,Numeric,90,140
220051,Arterial Blood Pressure diastolic,ABPd,chartevents,Routine Vital Signs,mmHg,Numeric,60,90
220052,Arterial Blood Pressure mean,ABPm,chartevents,Routine Vital Signs,mmHg,Numeric,,

We are interested in the vitals for ICU patients: heart rate (220045), systolic non-invasive blood pressure (220179), diastolic non-invasive blood pressure (220180), body temperature in Fahrenheit (223761), and respiratory rate (220210). Retrieve a subset of chartevents.csv.gz only containing these items for the patients in icustays_tble. Further restrict to the first vital measurement within the ICU stay. The final chartevents_tble should have one row per ICU stay and columns for each vital measurement.

Hint: Use the Parquet format you generated in Homework 2. For reproducibility, make chartevents_pq folder available at the current working directory, for example, by a symbolic link.

ANSWER: Read chartevent data and attach label for each item:

#Read chartevent data
chartevents_parquet <- open_dataset("chartevents_pq", format = "parquet")
  
parquet_data <- chartevents_parquet %>%
  select(subject_id, stay_id, itemid, charttime, storetime, valuenum) %>%
  filter(itemid %in% c(220045, 220181, 220179, 223761, 220210)) 

chartevents_tibble <- parquet_data %>% collect()

d_items_data <- read_csv("~/mimic/icu/d_items.csv.gz")
d_items_data <- d_items_data %>%
  select(itemid, label, abbreviation)
chartevents_data <- chartevents_tibble %>%
  left_join(d_items_data, by = "itemid")

Join processed chartevents data and icustays data to get items for the patients in icustays_tble. Restrict to the the first vital measurement between the ICU stay (from intime to outtime). For a single stay_id, remove duplicated itemid by only keeping the item with latest storetime and charttime:

chartevents_icu <- chartevents_data %>%
  filter(subject_id %in% icustays_tble$subject_id) %>%
  left_join(select(icustays_tble, subject_id, intime, outtime), 
            by = c("subject_id")) %>%
  group_by(subject_id, stay_id, itemid) %>%
  filter(charttime >= intime, charttime <= outtime) %>%
  arrange(subject_id, stay_id, itemid, desc(storetime), desc(charttime)) %>%
  group_by(subject_id, stay_id, itemid) %>%
  slice_head(n = 1) %>%
  ungroup()
Warning in left_join(., select(icustays_tble, subject_id, intime, outtime), : Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 558 of `x` matches multiple rows in `y`.
ℹ Row 6 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
  "many-to-many"` to silence this warning.

Transfer the label and valuenum columns to get value for each chart events:

chartevents_tble <- chartevents_icu %>%
  select(subject_id, stay_id, label, valuenum) %>%
  group_by(subject_id) %>%
  pivot_wider(
    names_from = label,  
    values_from = valuenum 
  ) %>%
  ungroup()
chartevents_tble <- chartevents_tble %>%
  rename(
    heart_rate = `Heart Rate`,
    non_invasive_blood_pressure_systolic = 
      `Non Invasive Blood Pressure systolic`,
    non_invasive_blood_pressure_mean = `Non Invasive Blood Pressure mean`,
    respiratory_rate = `Respiratory Rate`,
    temperature_fahrenheit = `Temperature Fahrenheit`
  )
chartevents_tble
# A tibble: 73,164 × 7
   subject_id  stay_id heart_rate non_invasive_blood_pr…¹ non_invasive_blood_p…²
        <dbl>    <int>      <dbl>                   <dbl>                  <dbl>
 1   10000032 39553978         94                      85                     62
 2   10000980 39765666         69                     131                     85
 3   10001217 34592300         80                     107                     84
 4   10001217 37067082         93                     144                     99
 5   10001725 31205490         73                      91                     65
 6   10001884 37510196         74                      86                     60
 7   10002013 39060235         94                     106                     72
 8   10002155 31090461        101                     116                     74
 9   10002155 32358465         75                      66                     37
10   10002155 33685454        106                     138                     85
# ℹ 73,154 more rows
# ℹ abbreviated names: ¹​non_invasive_blood_pressure_systolic,
#   ²​non_invasive_blood_pressure_mean
# ℹ 2 more variables: respiratory_rate <dbl>, temperature_fahrenheit <dbl>

Q7. Putting things together

Let us create a tibble mimic_icu_cohort for all ICU stays, where rows are all ICU stays of adults (age at intime >= 18) and columns contain at least following variables

  • all variables in icustays_tble
  • all variables in admissions_tble
  • all variables in patients_tble
  • the last lab measurements before the ICU stay in labevents_tble
  • the first vital measurements during the ICU stay in chartevents_tble

The final mimic_icu_cohort should have one row per ICU stay and columns for each variable.

ANSWER: Join table:

mimic_icu_cohort <- icustays_tble %>%
  left_join(admissions_tble, by = c("subject_id", "hadm_id")) %>%
  left_join(patients_tble, by = "subject_id") %>%
  left_join(labevents_tble, by = c("stay_id", "subject_id")) %>%
  left_join(chartevents_tble, by = c("stay_id", "subject_id"))

#mimic_icu_cohort

Actual intime age calculation:

#Calculate real age at intime and filter
mimic_icu_cohort <- mimic_icu_cohort %>%
  mutate(
    in_year = year(intime),
    year_difference = as.numeric(in_year) - as.numeric(anchor_year),
    age_at_intime = ifelse(anchor_age > 89, 91, anchor_age) + year_difference
  ) %>%
  select(-in_year, -year_difference) %>%
  filter(age_at_intime >= 18)
mimic_icu_cohort
# A tibble: 73,181 × 41
   subject_id  hadm_id  stay_id first_careunit last_careunit intime             
        <dbl>    <dbl>    <dbl> <chr>          <chr>         <dttm>             
 1   10000032 29079034 39553978 Medical Inten… Medical Inte… 2180-07-23 14:00:00
 2   10000980 26913865 39765666 Medical Inten… Medical Inte… 2189-06-27 08:42:00
 3   10001217 24597018 37067082 Surgical Inte… Surgical Int… 2157-11-20 19:18:02
 4   10001217 27703517 34592300 Surgical Inte… Surgical Int… 2157-12-19 15:42:24
 5   10001725 25563031 31205490 Medical/Surgi… Medical/Surg… 2110-04-11 15:52:22
 6   10001884 26184834 37510196 Medical Inten… Medical Inte… 2131-01-11 04:20:05
 7   10002013 23581541 39060235 Cardiac Vascu… Cardiac Vasc… 2160-05-18 10:00:53
 8   10002155 20345487 32358465 Medical Inten… Medical Inte… 2131-03-09 21:33:00
 9   10002155 23822395 33685454 Coronary Care… Coronary Car… 2129-08-04 12:45:00
10   10002155 28994087 31090461 Medical/Surgi… Medical/Surg… 2130-09-24 00:50:00
# ℹ 73,171 more rows
# ℹ 35 more variables: outtime <dttm>, los <dbl>, admittime <dttm>,
#   dischtime <dttm>, deathtime <dttm>, admission_type <chr>,
#   admit_provider_id <chr>, admission_location <chr>,
#   discharge_location <chr>, insurance <chr>, language <chr>,
#   marital_status <chr>, race <chr>, edregtime <dttm>, edouttime <dttm>,
#   hospital_expire_flag <dbl>, gender <chr>, anchor_age <dbl>, …

Q8. Exploratory data analysis (EDA)

Summarize the following information about the ICU stay cohort mimic_icu_cohort using appropriate numerics or graphs:

  • Length of ICU stay los vs demographic variables (race, insurance, marital_status, gender, age at intime)

Race:

race_summary <- mimic_icu_cohort %>%
  group_by(race) %>%
  summarise(
    mean_los = mean(los, na.rm = TRUE),
    median_los = median(los, na.rm = TRUE),
    IQR_los = IQR(los, na.rm = TRUE)
  )
race_summary
# A tibble: 33 × 4
   race                          mean_los median_los IQR_los
   <chr>                            <dbl>      <dbl>   <dbl>
 1 AMERICAN INDIAN/ALASKA NATIVE     4.46       2.09    4.01
 2 ASIAN                             3.49       1.93    2.70
 3 ASIAN - ASIAN INDIAN              4.20       1.95    2.81
 4 ASIAN - CHINESE                   3.36       1.86    2.28
 5 ASIAN - KOREAN                    4.23       2.08    2.85
 6 ASIAN - SOUTH EAST ASIAN          3.04       1.82    1.90
 7 BLACK/AFRICAN                     3.82       2.02    3.04
 8 BLACK/AFRICAN AMERICAN            3.28       1.85    2.43
 9 BLACK/CAPE VERDEAN                3.22       1.70    2.53
10 BLACK/CARIBBEAN ISLAND            3.61       1.95    2.45
# ℹ 23 more rows
race_plot <- ggplot(mimic_icu_cohort, aes(x = race, y = los)) +
  geom_boxplot(size = 0.1, alpha = 0.5)+
  theme_bw() +
  theme(axis.text.x = element_text(angle = 50, hjust = 1))

print(race_plot)

Insurance:

insurance_summary <- mimic_icu_cohort %>%
  group_by(insurance) %>%
  summarise(
    mean_los = mean(los, na.rm = TRUE),
    median_los = median(los, na.rm = TRUE),
    IQR_los = IQR(los, na.rm = TRUE)
  )

print(insurance_summary)
# A tibble: 3 × 4
  insurance mean_los median_los IQR_los
  <chr>        <dbl>      <dbl>   <dbl>
1 Medicaid      3.46       1.81    2.46
2 Medicare      3.48       2.01    2.71
3 Other         3.42       1.87    2.55
insurance_plot <- ggplot(mimic_icu_cohort, aes(x = insurance, y = los)) +
  geom_boxplot(size = 0.1, alpha = 0.5) +
  theme_bw() +
  theme(axis.text.x = element_text())

print(insurance_plot)

Marital Status:

summary_by_marital_status <- mimic_icu_cohort %>% 
  group_by(marital_status) %>% 
  summarise(mean_los = mean(los, na.rm = TRUE), 
            median_los = median(los, na.rm = TRUE), 
            IQR_los = IQR(los, na.rm = TRUE))
print(summary_by_marital_status)
# A tibble: 5 × 4
  marital_status mean_los median_los IQR_los
  <chr>             <dbl>      <dbl>   <dbl>
1 DIVORCED           3.39       1.90    2.65
2 MARRIED            3.46       1.94    2.56
3 SINGLE             3.40       1.87    2.59
4 WIDOWED            3.13       1.93    2.41
5 <NA>               4.28       2.20    3.70
p_marital_status <- ggplot(mimic_icu_cohort, aes(x = marital_status, y = los)) + 
  geom_boxplot(size = 0.1, alpha = 0.5) + 
  theme_bw() +
  theme(axis.text.x = element_text())
print(p_marital_status)

Gender:

summary_by_gender <- mimic_icu_cohort %>% 
  group_by(gender) %>% 
  summarise(mean_los = mean(los, na.rm = TRUE), 
            median_los = median(los, na.rm = TRUE), 
            IQR_los = IQR(los, na.rm = TRUE))

print(summary_by_gender)
# A tibble: 2 × 4
  gender mean_los median_los IQR_los
  <chr>     <dbl>      <dbl>   <dbl>
1 F          3.34       1.91    2.60
2 M          3.54       1.94    2.65
p_gender <- ggplot(mimic_icu_cohort, aes(x = gender, y = los)) + 
  geom_boxplot(size = 0.1, alpha = 0.5) +
  theme_bw() +
  theme(axis.text.x = element_text())
print(p_gender)

Age at intime:

p_age <- ggplot(mimic_icu_cohort, aes(x = age_at_intime, y = los)) + 
  geom_smooth(method = lm) +
  theme_bw()
print(p_age)

  • Length of ICU stay los vs the last available lab measurements before ICU stay

Bicarbonate

p_Bicarbonate <- ggplot(mimic_icu_cohort, aes(x = Bicarbonate, y = los)) + 
  geom_smooth(method = lm) +
  theme_bw()
print(p_Bicarbonate)
Warning: Removed 9050 rows containing non-finite values (`stat_smooth()`).

Chloride

p_Chloride <- ggplot(mimic_icu_cohort, aes(x = Chloride, y = los)) + 
  geom_smooth(method = lm) +
  theme_bw()
print(p_Chloride)
Warning: Removed 8883 rows containing non-finite values (`stat_smooth()`).

Creatinine

p_Creatinine <- ggplot(mimic_icu_cohort, aes(x = Creatinine, y = los)) + 
  geom_smooth(method = lm) +
  theme_bw()
print(p_Creatinine)
Warning: Removed 5770 rows containing non-finite values (`stat_smooth()`).

Glucose

p_Glucose <- ggplot(mimic_icu_cohort, aes(x = Glucose, y = los)) + 
  geom_smooth(method = lm) +
  theme_bw()
print(p_Glucose)
Warning: Removed 9099 rows containing non-finite values (`stat_smooth()`).

Potassium

p_Potassium <- ggplot(mimic_icu_cohort, aes(x = Potassium, y = los)) + 
  geom_smooth(method = lm) +
  theme_bw()
print(p_Potassium)
Warning: Removed 8901 rows containing non-finite values (`stat_smooth()`).

Sodium

p_Sodium <- ggplot(mimic_icu_cohort, aes(x = Sodium, y = los)) + 
  geom_smooth(method = lm) +
  theme_bw()
print(p_Sodium)
Warning: Removed 8872 rows containing non-finite values (`stat_smooth()`).

Hematocrit

p_Hematocrit <- ggplot(mimic_icu_cohort, aes(x = Hematocrit, y = los)) + 
  geom_smooth(method = lm) +
  theme_bw()
print(p_Hematocrit)
Warning: Removed 5017 rows containing non-finite values (`stat_smooth()`).

White Blood Cells

p_white_blood_cells <- ggplot(mimic_icu_cohort, aes(x = white_blood_cells, y = los)) + 
  geom_smooth(method = lm) +
  theme_bw()
print(p_white_blood_cells)
Warning: Removed 5094 rows containing non-finite values (`stat_smooth()`).

  • Length of ICU stay los vs the average vital measurements within the first hour of ICU stay

Heart Rate

filtered_data <- mimic_icu_cohort %>%
  filter(heart_rate <= 500)

p_heart_rate <- ggplot(filtered_data, aes(x = heart_rate, y = los)) +
  geom_smooth(method = "lm") +
  theme_bw()

print(p_heart_rate)

Non Invasive Blood Pressure systolic

filtered_data <- mimic_icu_cohort %>%
  filter(non_invasive_blood_pressure_systolic <= 500)

p_non_invasive_blood_pressure_systolic <- ggplot(filtered_data, 
                       aes(x = non_invasive_blood_pressure_systolic, y = los)) +
  geom_smooth(method = "lm") +
  theme_bw()

print(p_non_invasive_blood_pressure_systolic)

Non Invasive Blood Pressure mean

filtered_data <- mimic_icu_cohort %>%
  filter(non_invasive_blood_pressure_mean <= 500)

p_non_invasive_blood_pressure_mean <- ggplot(filtered_data, 
                       aes(x = non_invasive_blood_pressure_mean, y = los)) +
  geom_smooth(method = "lm") +
  theme_bw()

print(p_non_invasive_blood_pressure_mean)

Respiratory Rate

filtered_data <- mimic_icu_cohort %>%
  filter(respiratory_rate <= 500)

p_respiratory_rate <- ggplot(filtered_data, 
                       aes(x = respiratory_rate, y = los)) +
  geom_smooth(method = "lm") +
  theme_bw()

print(p_respiratory_rate)

Temperature Fahrenheit

filtered_data <- mimic_icu_cohort %>%
  filter(temperature_fahrenheit <= 900)

p_temperature_fahrenheit <- ggplot(filtered_data, 
                       aes(x = temperature_fahrenheit, y = los)) +
  geom_smooth(method = "lm") +
  theme_bw()

print(p_temperature_fahrenheit)

  • Length of ICU stay los vs first ICU unit
careunit_summary <- mimic_icu_cohort %>%
  group_by(first_careunit) %>%
  summarise(
    mean_los = mean(los, na.rm = TRUE),
    median_los = median(los, na.rm = TRUE),
    IQR_los = IQR(los, na.rm = TRUE)
  )

print(careunit_summary)
# A tibble: 9 × 4
  first_careunit                                   mean_los median_los IQR_los
  <chr>                                               <dbl>      <dbl>   <dbl>
1 Cardiac Vascular Intensive Care Unit (CVICU)         3.29       1.99    2.08
2 Coronary Care Unit (CCU)                             3.19       2.01    2.64
3 Medical Intensive Care Unit (MICU)                   3.26       1.83    2.54
4 Medical/Surgical Intensive Care Unit (MICU/SICU)     3.08       1.81    2.18
5 Neuro Intermediate                                   3.42       2.34    3.12
6 Neuro Stepdown                                       2.59       1.69    2.27
7 Neuro Surgical Intensive Care Unit (Neuro SICU)      6.30       3.65    5.95
8 Surgical Intensive Care Unit (SICU)                  3.84       1.97    2.95
9 Trauma SICU (TSICU)                                  3.83       1.94    2.95
careunit_plot <- ggplot(mimic_icu_cohort, aes(x = first_careunit, y = los)) +
  geom_boxplot(size = 0.1, alpha = 0.5) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 50, hjust = 1))

print(careunit_plot)